# List all objects in the environment
# NOTE(review): clearing the workspace with rm(list = ls()) at the top of a
# script is discouraged — it silently destroys whatever is in the caller's
# session. Prefer running the script in a fresh R session instead.
objects_to_remove <- ls()

# Remove all objects
rm(list = objects_to_remove)
#install.packages(c("forecast", "expsmooth", "seasonal")) 
library(TTR)
library(forecast) 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries) 
library(expsmooth) 
library(fpp2)
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ ggplot2 3.4.2     ✔ fma     2.5
## 
library(seasonal)
library(MASS)
## 
## Attaching package: 'MASS'
## The following objects are masked from 'package:fma':
## 
##     cement, housing, petrol
library(stats)
library(TSA)
## Registered S3 methods overwritten by 'TSA':
##   method       from    
##   fitted.Arima forecast
##   plot.Arima   forecast
## 
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar
library(forecast)
library(ggplot2)
library(tseries)
library(imputeTS)
## 
## Attaching package: 'imputeTS'
## The following object is masked from 'package:tseries':
## 
##     na.remove
library(vars)
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
## 
##     na.locf
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: urca
## Loading required package: lmtest
library(timetk)
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
## 
##     accuracy
library(lmtest)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(xts)
library(dplyr)
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: <FF564E7B-F7DD-3BAE-972C-DE65F8735FC9> /Library/Frameworks/R.framework/Versions/4.2/Resources/modules/R_X11.so
##   Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libSM.6.dylib' (no such file), '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk1.8.0_241.jdk/Contents/Home/jre/lib/server/libSM.6.dylib' (no such file)
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)),
## stdout = TRUE): running command ''/usr/bin/otool' -L
## '/Library/Frameworks/R.framework/Resources/library/tcltk/libs//tcltk.so'' had
## status 1
## Could not load tcltk.  Will use slower R code instead.
## Loading required package: RSQLite

# Step 1: Load data and plot the time series

# NOTE(review): hard-coded absolute user paths only work on one machine;
# consider a relative path or a script argument.
path <- "/Users/nikithagopal/Desktop/MSCA Data/Time Series Final/"
pm2_5_data <- read.csv(paste0(path, "la_pm2_5_pollutant_data.csv"), 
                     na.strings=c("", "NA"))
ozone_data <- read.csv(paste0(path, "la_ozone_pollutant_data.csv"), 
                     na.strings=c("", "NA"))

#path <- "/Users/sydneypeters/Desktop/"
#pm2_5_data <- read.csv(paste0(path, "pm2.5_LA.csv"), 
#                     na.strings=c("", "NA"))
#ozone_data <- read.csv(paste0(path, "ozone_LA.csv"), 
#                     na.strings=c("", "NA"))

# Convert data into ts objects (daily observations, 365.25 periods/year).
# ts() only uses the first two elements of `start`, so c(1999, 1) is the
# correct form — the original c(1999, 1, 1) silently ignored the third value.
pm2_5_ts <- ts(pm2_5_data$PM2.5.AQI.Value, start = c(1999,1), frequency = 365.25)
ozone_ts <- ts(ozone_data$Ozone.AQI.Value, start = c(1999,1), frequency = 365.25)

plot(pm2_5_ts, main="Los Angeles PM 2.5 AQI")

plot(ozone_ts, main="Los Angeles Ozone AQI")

# Step 2: Check if data is stationary (KPSS/ADF Test)

kpss.test(pm2_5_ts)
## Warning in kpss.test(pm2_5_ts): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  pm2_5_ts
## KPSS Level = 6.951, Truncation lag parameter = 12, p-value = 0.01
# The reported p-value is 0.01, which is smaller than 0.05, and would suggest 
# that we reject the null hypothesis of level stationarity and conclude that 
# there is evidence that the data is non-stationary

adf.test(pm2_5_ts)
## Warning in adf.test(pm2_5_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  pm2_5_ts
## Dickey-Fuller = -14.554, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary
# Similarly the Augmented Dickey-Fuller test results in a p-value of 0.01 (<0.05) 
# where we do reject the null hypothesis that the time series is non-stationary
# and thus data is stationary.

# These are opposite results so we use ACF PACF plots to see if the series is stationary
acf(pm2_5_ts, main = "ACF of Original Time Series")

pacf(pm2_5_ts, main = "PACF of Original Time Series")

#### Step 3: Prepare data for Weekly & Monthly extraction and EDA

# Create data with Dates: as.data.frame() turns the ts into a one-column data
# frame; calendar dates are re-attached from the raw CSV, parsed with
# lubridate::mdy() (assumes the CSV Date column is month/day/year — TODO
# confirm against the file).
pm2_5_df <- as.data.frame(pm2_5_ts)
pm2_5_df$Date <- mdy(pm2_5_data$Date)
colnames(pm2_5_df) <- c("PM2.5.AQI.Value", "Date")

# Convert to xts format for weekly & monthly aggregation. as.xts() keeps the
# Date column as a data column in addition to using it as the index, so the
# second column is dropped to leave only the AQI values.
pm2_5_xts <- as.xts(pm2_5_df, order.by=pm2_5_df$Date)
pm2_5_xts <- pm2_5_xts[,-2]

# Step 4: Understand PM2.5 AQI variance across years

# Let's see how seasonality looks over time and is the variance changing
pm2_5_df$Month <- month(pm2_5_df$Date)
pm2_5_df$Year <- year(pm2_5_df$Date)

# Mean AQI per (Year, Month). Grouping by bare column names (rather than
# pm2_5_df$Year, which bypasses dplyr's data masking and produces mangled
# column names that then needed renaming) keeps the output columns clean;
# .groups = "drop" returns an ungrouped result and silences the
# `summarise()` grouping message.
avg_aqi_by_month_year <- pm2_5_df %>%
  group_by(Year, Month) %>%
  summarise(
    avg_value = mean(PM2.5.AQI.Value),
    .groups = "drop"
  )

# Pivot to one row per Month with one column per Year (1999-2023).
# tidyr::pivot_wider() (tidyr is already loaded above) replaces the previous
# 25-branch "MAX(CASE WHEN Year = ... )" sqldf query and yields the
# year-named columns directly, so no manual renaming is needed.
reshape_avg_aqi_by_month_year <- avg_aqi_by_month_year %>%
  pivot_wider(names_from = Year, values_from = avg_value) %>%
  arrange(Month)

# Columns 2:26 are the 25 year columns (1999-2023)
boxplot(reshape_avg_aqi_by_month_year[2:26])

# We can see that over the years there is downward trend and variance is decreasing
# except 2020 when we see a high peak likely caused by wildfires
# Link: https://en.wikipedia.org/wiki/Lake_Fire_(2020)

# Step 5a: Extract values for time series and plot the series - Weekly

# Aggregate daily values to weekly means; 52.18 = 365.25 / 7 weeks per year.
pm2_5_weekly <- apply.weekly(pm2_5_xts, mean)
pm2_5_weekly_ts <- ts(pm2_5_weekly["19990103/20230811"], 
                      start = c(1999,1),
                      frequency = 52.18)
plot(pm2_5_weekly_ts)

# Strong seasonality, Strong cyclic, light downward trend, variance is reducing

kpss.test(pm2_5_weekly_ts)
## Warning in kpss.test(pm2_5_weekly_ts): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  pm2_5_weekly_ts
## KPSS Level = 3.4904, Truncation lag parameter = 7, p-value = 0.01
adf.test(pm2_5_weekly_ts)
## Warning in adf.test(pm2_5_weekly_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  pm2_5_weekly_ts
## Dickey-Fuller = -8.4636, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
acf(pm2_5_weekly_ts, main = "ACF of Weekly Time Series")

pacf(pm2_5_weekly_ts, main = "PACF of Weekly Time Series")

# Step 6a: Split data into train and test - Weekly

# Split the data into a training (1999-2020) and test (2021 onward) dataset
train_weekly <- window(pm2_5_weekly_ts, start = c(1999,1), end=c(2020,52))
test_weekly <- window(pm2_5_weekly_ts, start = c(2021,1))

# Step 7a: Decompose the time series plot - Weekly

# Looking at spectral analysis
periodogram(train_weekly, log = "no", plot=TRUE, 
            ylab = "Periodogram",
            xlab = "Frequency",
            lwd=2, xlim = c(0, 0.06))

# Three periodic components stand out: 
#  1) A typical yearly (52 weeks) seasonal trend
#  2) A trend that is repeating every 9.6 years (500 weeks)
#  3) A typical half yearly (26 weeks) seasonal trend

# Overall, there is no mixed effect that normal seasonality model cannot capture.

# Decompose the series
plot(decompose(train_weekly, type="multiplicative"))

# Important Inferences
# 1) The PM2.5 AQI has been decreasing overall though there are rise every now and then.
#    However the trend is going down with time.
# 2) Winter months have a strong seasonal effect with Nov and Dec being the peak months
#    usually which likely could be due to cold temperatures causing pollutant to
#    not escape the lower atmosphere.
#    Link: https://www.accuweather.com/en/health-wellness/why-air-pollution-is-worse-in-winter/689434#:~:text=Cold%20air%20is%20denser%20and%20moves%20slower%20than%20warm%20air,rate%20than%20during%20the%20summer.
# 3) We can see a seasonal cycle of 12 months where the mean value of each month starts 
#    with a increasing trend in the beginning of the year and drops down towards 
#    the end of the year. We can see a seasonal effect with a cycle of 12 months.


# Understand the seasonality and remove it to see the trend
train_weekly_diff <- diff(train_weekly, lag = 52)
autoplot(train_weekly_diff, ylab="Train Seasonal Differencing - Weekly Data")

# Let's look if the series is stationary?
kpss.test(train_weekly_diff)
## Warning in kpss.test(train_weekly_diff): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  train_weekly_diff
## KPSS Level = 0.17702, Truncation lag parameter = 7, p-value = 0.1
# The reported p-value is 0.1, which is > 0.05, and would suggest that we fail 
# to reject the null hypothesis of level stationarity (conclusion: stationary)

adf.test(train_weekly_diff)
## Warning in adf.test(train_weekly_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_weekly_diff
## Dickey-Fuller = -8.5534, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
# The reported p-value of 0.01 (<0.05) so we do reject the null hypothesis that 
# the time series is non-stationary (conclusion: stationary)

acf(train_weekly_diff, main = "ACF of Seasonal Differencing Time Series - Weekly Data")

pacf(train_weekly_diff, main = "PACF of Seasonal Differencing Time Series - Weekly Data")

# Step 8a: Benchmark using snaive model - Weekly

# Forecast the seasonal naïve for (2021-01 to 2023-07).
# BUG FIX: the horizon must cover the whole test set (137 weeks). The
# original h=110 stopped short; ts arithmetic aligns on the overlapping
# window only, so the accuracy metrics below silently ignored the last
# test weeks. (Output values recorded below came from the h=110 run.)
forecast_snaive_weekly <- snaive(train_weekly, h=length(test_weekly))
## Warning in lag.default(y, -lag): 'k' is not an integer
# NOTE(review): the warning above comes from the non-integer frequency
# (52.18) used for the weekly ts — the seasonal lag gets rounded internally.
# Plot the forecasts for snaive model
plot(forecast_snaive_weekly, main = "PM 2.5 AQI Forecast - SNaive (Weekly)",
         xlab = "Week", ylab = "PM 2.5 AQI")
lines(test_weekly)

# Compare the forecast with the actual test data by calculating MAPE and MSE
# Mean Absolute Percentage Error (MAPE)
mape_snaive_weekly <- mean(abs((test_weekly - forecast_snaive_weekly$mean)/test_weekly))
mape_snaive_weekly
## [1] 0.2749545
# Mean Absolute Error (MAE)
mae_snaive_weekly <- mean(abs((test_weekly - forecast_snaive_weekly$mean)))
mae_snaive_weekly
## [1] 16.28571
# Mean Squared Error (MSE)
mse_snaive_weekly <- mean((test_weekly - forecast_snaive_weekly$mean)^2)
mse_snaive_weekly
## [1] 599.6278
# Evaluate the residuals
checkresiduals(forecast_snaive_weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 509.98, df = 104.36, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 104.36

# Step 5b: Extract values for time series and plot the series - Monthly

# Create monthly data (monthly means of the daily xts series)
pm2_5_monthly <- apply.monthly(pm2_5_xts, mean)

pm2_5_monthly_ts <- ts(pm2_5_monthly["19990131/20230811"], start = c(1999,1), frequency = 12)
plot(pm2_5_monthly_ts)

kpss.test(pm2_5_monthly_ts)
## Warning in kpss.test(pm2_5_monthly_ts): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  pm2_5_monthly_ts
## KPSS Level = 1.9774, Truncation lag parameter = 5, p-value = 0.01
adf.test(pm2_5_monthly_ts)
## Warning in adf.test(pm2_5_monthly_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  pm2_5_monthly_ts
## Dickey-Fuller = -7.4466, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
acf(pm2_5_monthly_ts, main = "ACF of Monthly Time Series")

pacf(pm2_5_monthly_ts, main = "PACF of Monthly Time Series")

# Step 6b: Split data into train and test - Monthly

# Split the data into a training (1999-2020) and test (2021 onward) dataset
train_monthly <- window(pm2_5_monthly_ts, start = c(1999,1), end=c(2020,12))
test_monthly <- window(pm2_5_monthly_ts, start = c(2021,1))

# Step 7b: Decompose the time series plot - Monthly

# Looking at spectral analysis
periodogram(train_monthly, log = "no", plot=TRUE, 
            ylab = "Periodogram",
            xlab = "Frequency",
            lwd=2, xlim = c(0, 0.2))

# Three periodic components stand out: 
#  1) A typical yearly (12 months) seasonal trend
#  2) A trend that is repeating every 8-9 years (100 months)
#  3) A typical half yearly (6 months) seasonal trend

# Overall, there is no mixed effect that normal seasonality model cannot capture.

# Decompose the series
plot(decompose(train_monthly, type="multiplicative"))

# Important Inferences
# 1) The PM2.5 AQI has been decreasing overall though there are rise every now and then.
#    However the trend is going down with time.
# 2) Winter months have a strong seasonal effect with Nov and Dec being the peak months
#    usually which likely could be due to cold temperatures causing pollutant to
#    not escape the lower atmosphere.
#    Link: https://www.accuweather.com/en/health-wellness/why-air-pollution-is-worse-in-winter/689434#:~:text=Cold%20air%20is%20denser%20and%20moves%20slower%20than%20warm%20air,rate%20than%20during%20the%20summer.
# 3) We can see a seasonal cycle of 12 months where the mean value of each month starts 
#    with a increasing trend in the beginning of the year and drops down towards 
#    the end of the year. We can see a seasonal effect with a cycle of 12 months.

# Understand the seasonality and remove it to see the trend
train_monthly_diff <- diff(train_monthly, lag = 12)
autoplot(train_monthly_diff, ylab="Train Seasonal Differencing (Monthly)")

# Let's look if the series is stationary?
kpss.test(train_monthly_diff)
## Warning in kpss.test(train_monthly_diff): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  train_monthly_diff
## KPSS Level = 0.14573, Truncation lag parameter = 5, p-value = 0.1
# The reported p-value is 0.1, which is > 0.05, and would suggest that we fail 
# to reject the null hypothesis of level stationarity (conclusion: stationary)

adf.test(train_monthly_diff)
## Warning in adf.test(train_monthly_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_monthly_diff
## Dickey-Fuller = -5.1056, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# The reported p-value of 0.01 (<0.05) so we do reject the null hypothesis that 
# the time series is non-stationary (conclusion: stationary)

acf(train_monthly_diff, main = "ACF of Seasonal Differencing Time Series")

pacf(train_monthly_diff, main = "PACF of Seasonal Differencing Time Series")

# Step 8b: Benchmark using snaive model - Monthly

# Forecast the seasonal naïve for (2021-01 to 2023-07).
# BUG FIX: the horizon must cover the whole test set (32 months). The
# original h=31 stopped one month short; ts arithmetic aligns on the
# overlapping window only, so the metrics below silently dropped the final
# test observation. (Output values recorded below came from the h=31 run.)
forecast_snaive_monthly <- snaive(train_monthly, h=length(test_monthly))
# Plot the forecasts for snaive model (x axis is months here, not weeks)
plot(forecast_snaive_monthly, main = "PM 2.5 AQI Forecast - SNaive",
         xlab = "Month", ylab = "PM 2.5 AQI")
lines(test_monthly)

# Compare the forecast with the actual test data by calculating MAPE and MSE
# Mean Absolute Percentage Error (MAPE)
mape_snaive <- mean(abs((test_monthly - forecast_snaive_monthly$mean)/test_monthly))
mape_snaive
## [1] 0.20778
# Mean Absolute Error (MAE) — added for parity with the weekly benchmark
mae_snaive <- mean(abs(test_monthly - forecast_snaive_monthly$mean))
mae_snaive
# Mean Squared Error (MSE)
mse_snaive <- mean((test_monthly - forecast_snaive_monthly$mean)^2)
mse_snaive
## [1] 282.8901
# Evaluate the residuals
checkresiduals(forecast_snaive_monthly)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 82.387, df = 24, p-value = 2.523e-08
## 
## Model df: 0.   Total lags used: 24

# Step 9a: TBATS model - Weekly

vec <- as.vector(train_weekly)

# Rebuild a plain univariate ts at the weekly frequency for STL
univariate_ts <- ts(vec, start=1999, frequency=52.18)

stl_decomposition <- stl(univariate_ts, s.window="periodic")
plot(stl_decomposition)

h <- length(test_weekly)

model_tbats <- tbats(train_weekly)
forecast_tbats <- forecast(model_tbats, h=h)
plot(forecast_tbats)

checkresiduals(model_tbats)

## 
##  Ljung-Box test
## 
## data:  Residuals from TBATS
## Q* = 128.92, df = 104.36, p-value = 0.05179
## 
## Model df: 0.   Total lags used: 104.36
#Ljung-Box test .05179

plot(pm2_5_weekly_ts, xlim=c(1999, 2023), ylim=c(min(pm2_5_weekly_ts), max(pm2_5_weekly_ts)), main="Train, Test, and Forecasted Data - Weekly", xlab="Year", ylab="PM2.5 AQI Value")
lines(test_weekly, col="red") # Test data in red
lines(forecast_tbats$mean, col="blue", type="o") # Forecasted data in blue
legend("topleft", legend=c("Train", "Test", "Forecast"), fill=c("black", "red", "blue"))

# BUG FIX: cat() returns NULL invisibly, so the original print(cat(...))
# printed a stray "NULL" line after the message; cat() alone suffices.
mae <- mean(abs(test_weekly - forecast_tbats$mean))
cat("Mean Absolute Error (MAE):", round(mae, 2), "\n")
## Mean Absolute Error (MAE): 9.25 
#Mean Absolute Error (MAE): 9.31 

# Side-by-side table of actual vs forecast values for the test window
comparison_df <- data.frame(
  Date = time(test_weekly),
  Actual = as.vector(test_weekly),
  Forecasted = as.vector(forecast_tbats$mean)
)
print(comparison_df)
##         Date    Actual Forecasted
## 1   2021.001  91.57143   79.76285
## 2   2021.020  83.28571   81.90778
## 3   2021.039  77.28571   80.15252
## 4   2021.058  52.57143   75.73405
## 5   2021.077  46.42857   69.93137
## 6   2021.097  65.57143   64.53847
## 7   2021.116  64.57143   60.93566
## 8   2021.135  54.57143   59.52249
## 9   2021.154  58.42857   59.74437
## 10  2021.173  52.14286   60.36729
## 11  2021.192  43.85714   60.06514
## 12  2021.212  45.14286   58.25418
## 13  2021.231  50.85714   55.52292
## 14  2021.250  60.42857   53.17368
## 15  2021.269  63.14286   52.36580
## 16  2021.288  62.71429   53.55979
## 17  2021.307  55.00000   56.32865
## 18  2021.327  54.00000   59.41406
## 19  2021.346  68.14286   61.25017
## 20  2021.365  62.85714   60.99757
## 21  2021.384  56.14286   59.24536
## 22  2021.403  63.14286   57.57820
## 23  2021.422  60.71429   57.52748
## 24  2021.442  52.71429   59.84829
## 25  2021.461  67.28571   64.22669
## 26  2021.480  55.85714   69.20070
## 27  2021.499  73.85714   72.60041
## 28  2021.518  81.85714   72.82056
## 29  2021.537  61.71429   70.03912
## 30  2021.557  68.85714   66.01692
## 31  2021.576  59.14286   62.75970
## 32  2021.595  63.28571   61.47326
## 33  2021.614  58.14286   62.33043
## 34  2021.633  64.00000   64.64065
## 35  2021.652  71.42857   67.16719
## 36  2021.672  69.42857   68.67033
## 37  2021.691  63.42857   68.55306
## 38  2021.710  62.85714   67.13541
## 39  2021.729  70.14286   65.32834
## 40  2021.748  57.85714   64.06076
## 41  2021.767  52.57143   63.90400
## 42  2021.787  67.00000   64.98187
## 43  2021.806  50.14286   67.02575
## 44  2021.825  61.00000   69.47144
## 45  2021.844 107.14286   71.60396
## 46  2021.863  73.42857   72.79015
## 47  2021.882  98.85714   72.76263
## 48  2021.901  63.42857   71.80027
## 49  2021.921 125.71429   70.64740
## 50  2021.940  82.57143   70.18538
## 51  2021.959  65.00000   71.02913
## 52  2021.978  74.42857   73.19265
## 53  2021.997  58.14286   75.89627
## 54  2022.016  83.71429   77.68823
## 55  2022.036  64.42857   77.12026
## 56  2022.055  64.57143   73.75393
## 57  2022.074  63.71429   68.59693
## 58  2022.093  69.57143   63.44149
## 59  2022.112  64.71429   59.78428
## 60  2022.131  58.85714   58.21952
## 61  2022.151  49.14286   58.38302
## 62  2022.170  55.42857   59.16884
## 63  2022.189  56.14286   59.23610
## 64  2022.208  51.71429   57.82145
## 65  2022.227  57.42857   55.30064
## 66  2022.246  50.57143   52.89422
## 67  2022.266  63.42857   51.83215
## 68  2022.285  44.57143   52.72377
## 69  2022.304  43.85714   55.31528
## 70  2022.323  62.71429   58.49065
## 71  2022.342  61.14286   60.68321
## 72  2022.361  49.28571   60.84876
## 73  2022.381  56.42857   59.31154
## 74  2022.400  58.85714   57.54305
## 75  2022.419  60.14286   57.14814
## 76  2022.438  61.28571   59.05077
## 77  2022.457  56.28571   63.14531
## 78  2022.476  62.00000   68.17263
## 79  2022.496  59.14286   72.02334
## 80  2022.515  76.14286   72.87993
## 81  2022.534  56.14286   70.56173
## 82  2022.553  57.85714   66.62489
## 83  2022.572  53.00000   63.12852
## 84  2022.591  50.28571   61.45866
## 85  2022.611  56.85714   61.96773
## 86  2022.630  57.42857   64.10343
## 87  2022.649  56.85714   66.69659
## 88  2022.668  62.28571   68.45928
## 89  2022.687  53.00000   68.64162
## 90  2022.706  50.85714   67.40666
## 91  2022.726  40.71429   65.60068
## 92  2022.745  56.71429   64.18580
## 93  2022.764  65.14286   63.81107
## 94  2022.783  58.00000   64.67913
## 95  2022.802  54.42857   66.58590
## 96  2022.821  58.85714   69.01204
## 97  2022.841  52.85714   71.25557
## 98  2022.860  50.57143   72.64784
## 99  2022.879  64.42857   72.83669
## 100 2022.898  77.71429   71.99981
## 101 2022.917  60.28571   70.81647
## 102 2022.936  54.28571   70.17340
## 103 2022.956  61.28571   70.76115
## 104 2022.975  99.00000   72.72148
## 105 2022.994  51.42857   75.42154
## 106 2023.013  48.42857   77.49853
## 107 2023.032  51.00000   77.42970
## 108 2023.051  58.00000   74.53271
## 109 2023.071  61.85714   69.57444
## 110 2023.090  60.14286   64.28845
## 111 2023.109  54.71429   60.28911
## 112 2023.128  55.42857   58.34891
## 113 2023.147  47.71429   58.26621
## 114 2023.166  48.14286   59.04205
## 115 2023.186  39.42857   59.32318
## 116 2023.205  48.57143   58.18239
## 117 2023.224  42.42857   55.78515
## 118 2023.243  49.14286   53.25692
## 119 2023.262  49.14286   51.88511
## 120 2023.281  63.85714   52.41753
## 121 2023.300  61.85714   54.76253
## 122 2023.320  70.28571   57.94557
## 123 2023.339  31.71429   60.42250
## 124 2023.358  52.42857   60.97324
## 125 2023.377  63.71429   59.65558
## 126 2023.396  44.57143   57.80028
## 127 2023.415  46.00000   57.06366
## 128 2023.435  38.42857   58.52809
## 129 2023.454  46.85714   62.28755
## 130 2023.473  50.85714   67.28797
## 131 2023.492  62.14286   71.51375
## 132 2023.511  88.85714   72.98177
## 133 2023.530  58.71429   71.15887
## 134 2023.550  60.42857   67.35630
## 135 2023.569  65.14286   63.64954
## 136 2023.588  62.14286   61.59527
## 137 2023.607  54.60000   61.72969

# Step 9b: TBATS model - Monthly

vec <- as.vector(train_monthly)

# BUG FIX: the monthly series has 12 observations per year; the original
# frequency=52.18 was copy-pasted from the weekly step and mis-specified the
# seasonal period for the STL decomposition below.
univariate_ts <- ts(vec, start=1999, frequency=12)

stl_decomposition <- stl(univariate_ts, s.window="periodic")
plot(stl_decomposition)

h <- length(test_monthly)

model_tbats <- tbats(train_monthly)
forecast_tbats <- forecast(model_tbats, h=h)
plot(forecast_tbats)

checkresiduals(model_tbats)

## 
##  Ljung-Box test
## 
## data:  Residuals from TBATS
## Q* = 34.295, df = 24, p-value = 0.07956
## 
## Model df: 0.   Total lags used: 24
#Ljung-Box test .07956

plot(pm2_5_monthly_ts, xlim=c(1999, 2023), ylim=c(min(pm2_5_monthly_ts), max(pm2_5_monthly_ts)), main="Train, Test, and Forecasted Data - Monthly", xlab="Year", ylab="PM2.5 AQI Value")
lines(test_monthly, col="red") # Test data in red
lines(forecast_tbats$mean, col="blue", type="o") # Forecasted data in blue
legend("topleft", legend=c("Train", "Test", "Forecast"), fill=c("black", "red", "blue"))

# BUG FIX: cat() returns NULL invisibly, so the original print(cat(...))
# printed a stray "NULL" line after the message; cat() alone suffices.
mae <- mean(abs(test_monthly - forecast_tbats$mean))
cat("Mean Absolute Error (MAE):", round(mae, 2), "\n")
## Mean Absolute Error (MAE): 6.7 
#Mean Absolute Error (MAE): 6.61

# Side-by-side table of actual vs forecast values for the test window
comparison_df <- data.frame(
  Date = time(test_monthly),
  Actual = as.vector(test_monthly),
  Forecasted = as.vector(forecast_tbats$mean)
)
print(comparison_df)
##        Date   Actual Forecasted
## 1  2021.000 69.25806   70.59917
## 2  2021.083 60.78571   62.21158
## 3  2021.167 49.09677   56.78048
## 4  2021.250 58.90000   56.35278
## 5  2021.333 62.45161   60.29386
## 6  2021.417 59.50000   65.34224
## 7  2021.500 69.90323   67.73135
## 8  2021.583 64.67742   67.04131
## 9  2021.667 64.93333   66.37212
## 10 2021.750 57.61290   68.28974
## 11 2021.833 86.50000   71.88886
## 12 2021.917 80.35484   73.11090
## 13 2022.000 69.90323   69.00588
## 14 2022.083 60.14286   61.88329
## 15 2022.167 53.48387   56.71088
## 16 2022.250 53.73333   56.33676
## 17 2022.333 56.87097   60.28989
## 18 2022.417 60.23333   65.34124
## 19 2022.500 60.32258   67.73111
## 20 2022.583 55.70968   67.04126
## 21 2022.667 52.30000   66.37211
## 22 2022.750 58.70968   68.28973
## 23 2022.833 61.63333   71.88886
## 24 2022.917 66.22581   73.11090
## 25 2023.000 53.80645   69.00588
## 26 2023.083 54.42857   61.88329
## 27 2023.167 44.80645   56.71088
## 28 2023.250 60.83333   56.33676
## 29 2023.333 46.96774   60.28989
## 30 2023.417 48.60000   65.34124
## 31 2023.500 68.41935   67.73111
## 32 2023.583 58.36364   67.04126

# Step 10a: ARIMA model using auto.arima() - Weekly

eacf <-
function (z,ar.max=7,ma.max=13) 
{
#
#  PROGRAMMED BY K.S. CHAN, DEPARTMENT OF STATISTICS AND ACTUARIAL SCIENCE,
#  UNIVERSITY OF IOWA.
#
#  DATE: 4/2001
#  Compute the extended sample acf (ESACF) for the time series stored in z.
#  The matrix of ESACF with the AR order up to ar.max and the MA order
#  up to ma.max is stored in the matrix EACFM.
#  The default values for NAR and NMA are 7 and 13 respectively.
#  Side effect of the eacf function:
#  The function prints a coded ESACF table with
#  significant values denoted by * and nosignificant values by 0, significance
#  level being 5%.
#  (NOTE(review): the code below actually prints 'x' for significant and
#  'o' for non-significant entries, not '*' and '0' as stated above.)
#
#  Output:
#   eacf=matrix of esacf
#   symbol=matrix of coded esacf
#

# lag1: shift a series back by `lag` positions, padding the front with NAs.
lag1<-function(z,lag=1){c(rep(NA,lag),z[1:(length(z)-lag)])}
# reupm: one step of the iterated AR-coefficient recursion; builds the next
# matrix of AR estimates (one fewer usable column) from the current one.
reupm<-function(m1,nrow,ncol){
k<-ncol-1
m2<-NULL
for (i in 1:k){
i1<-i+1
work<-lag1(m1[,i])
# Leading element fixed at -1 so the update formula below is well-defined.
work[1]<--1
temp<-m1[,i1]-work*m1[i1,i1]/m1[i,i]
temp[i1]<-0
m2<-cbind(m2,temp)
}
m2}
# ceascf: one column of the extended sample ACF — lag-`count`
# autocorrelations of the AR-filtered residuals for AR orders 0..nar.
ceascf<-function(m,cov1,nar,ncol,count,ncov,z,zm){
result<-0*seq(1,nar+1)
result[1]<-cov1[ncov+count]
for (i in 1:nar) {
# Residuals of the order-i AR fit, then their sample ACF at lag `count`.
temp<-cbind(z[-(1:i)],zm[-(1:i),1:i])%*%c(1,-m[1:i,i])
result[i+1]<-acf(temp,plot=FALSE,lag.max=count,drop.lag.0=FALSE)$acf[count+1]
}
result
}

ar.max<-ar.max+1
ma.max<-ma.max+1
nar<-ar.max-1
nma<-ma.max
ncov<-nar+nma+2
nrow<-nar+nma+1
ncol<-nrow-1
# Work with the mean-centered series throughout.
z<-z-mean(z)
# zm: matrix whose columns are z lagged by 1..nar positions.
zm<-NULL
for(i in 1:nar) zm<-cbind(zm,lag1(z,lag=i))
# Sample autocorrelations, mirrored so negative lags can be indexed directly.
cov1<-acf(z,lag.max=ncov,plot=FALSE,drop.lag.0=FALSE)$acf
cov1<-c(rev(cov1[-1]),cov1)
ncov<-ncov+1
# m1: column i holds the OLS AR(i) coefficient estimates (no intercept).
m1<-matrix(0,ncol=ncol,nrow=nrow)
for(i in 1:ncol) m1[1:i,i]<-
ar.ols(z,order.max=i,aic=FALSE,demean=FALSE,intercept=FALSE)$ar
# Build the ESACF matrix one MA order (column) at a time.
eacfm<-NULL
for (i in 1:nma) {
m2<-reupm(m1=m1,nrow=nrow,ncol=ncol)
ncol<-ncol-1
eacfm<-cbind(eacfm, ceascf(m2,cov1,nar,ncol,i,ncov,z,zm))
m1<-m2}
# Code each entry: 'x' if |esacf| exceeds the approximate 5% significance
# bound 2/sqrt(effective sample size), 'o' otherwise.
work<-1:(nar+1)
work<-length(z)-work+1
symbol<-NULL
for ( i in 1:nma) {
work<-work-1
symbol<-cbind(symbol,ifelse(abs(eacfm[,i])>2/work^.5, 'x','o'))}
rownames(symbol)<-0:(ar.max-1)
colnames(symbol)<-0:(ma.max-1)
cat('AR/MA\n')
print(symbol,quote=FALSE)
# Printed table is the primary output; return the matrices invisibly.
invisible(list(eacf=eacfm,ar.max=ar.max,ma.ma=ma.max,symbol=symbol))
}
# Extended ACF table for the weekly training series; candidate (p, q)
# orders are read off where the triangle of 'o' entries begins.
eacf(train_weekly)
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x x x x x x x  o  o  o 
## 1 x x o o o o o x x o x  o  o  o 
## 2 x x o o o o o o x x x  o  o  o 
## 3 x x o o o o o o o o o  o  o  o 
## 4 x x x o o o o o o o o  o  o  o 
## 5 x x x x o o o o o o o  o  o  o 
## 6 x x o x o o o o o o o  o  o  o 
## 7 x x o x o x o o o o o  x  o  o
# Candidate orders suggested by the EACF table:
# p <- 1,2,3
# q <- 2,3,4
# Stationarity checks on the differenced weekly series.
# KPSS p > 0.05 (fail to reject stationarity) and ADF p < 0.05 (reject a
# unit root) together indicate the differenced series is stationary.
kpss_test_result <- kpss.test(train_weekly_diff)
## Warning in kpss.test(train_weekly_diff): p-value greater than printed p-value
adf_test_result <- adf.test(train_weekly_diff)
## Warning in adf.test(train_weekly_diff): p-value smaller than printed p-value
cat("KPSS: ", round(kpss_test_result$p.value, 2), "\n")
## KPSS:  0.1
cat("ADF: ", round(adf_test_result$p.value, 2), "\n")
## ADF:  0.01
# Fit ARIMA model using auto.arima()
# Find appropriate values for d and D using ndiffs and nsdiffs
# d <- ndiffs(train_weekly_diff)
# D <- nsdiffs(train_weekly_diff)

# Fit a seasonal ARIMA via an exhaustive search (stepwise and
# approximation both disabled), with an automatic Box-Cox transform
# (lambda = "auto") and p/q search bounds taken from the EACF table above.
arima_model_weekly <- auto.arima(train_weekly, seasonal = TRUE, stepwise = FALSE,
                                 approximation = FALSE, trace = FALSE, lambda = "auto",
                                 start.p = 1, max.p = 3,
                                 start.q = 2, max.q = 4)

# auto.arima(pm2_5_weekly_diff, lambda = "auto", seasonal=TRUE, )

# Forecast the fitted weekly model across the entire held-out period
h <- length(test_weekly)
forecast_arima_weekly <- forecast(arima_model_weekly, h = h)

# Fan chart of the forecast with the actual test series overlaid
plot(forecast_arima_weekly, main = "PM 2.5 AQI Forecast - sARIMA",
     xlab = "Week", ylab = "PM 2.5 AQI")
lines(test_weekly)

# Residual diagnostics: residual ACF plot plus the Ljung-Box test
checkresiduals(arima_model_weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,2)
## Q* = 187.38, df = 101.36, p-value = 4.411e-07
## 
## Model df: 3.   Total lags used: 104.36
# Extract the point forecasts for the test horizon
forecasted_values <- forecast_arima_weekly$mean

# Forecast accuracy on the held-out weekly test set
mae <- mean(abs(forecasted_values - test_weekly))
mape <- mean(abs((forecasted_values - test_weekly) / test_weekly)) * 100

# Information criteria straight from the fitted model. forecast's Arima
# objects already carry a correctly computed AICc in $aicc; the previous
# hand-rolled adjustment used the wrong formula (the standard correction
# is 2k(k+1)/(n-k-1), not 2(k+1)(k+2)/(n-k-2)) and the length of the
# differenced training series rather than the model's sample size.
aic <- arima_model_weekly$aic
bic <- arima_model_weekly$bic
aic_corrected <- arima_model_weekly$aicc

# Print the results
cat("MAE: ", round(mae, 2), "\n")
## MAE:  9.46
cat("MAPE: ", round(mape, 2), "%\n")
## MAPE:  16.5 %
cat("AICc: ", round(aic_corrected, 2), "\n")
## AICc:  -6034.44
cat("BIC: ", round(bic, 2), "\n")
## BIC:  -6014.3
# The h-step forecasts cover the same dates as the test set, so reuse the
# test series' own time index. The previous seq() started the index AFTER
# end(test_weekly), pushing the forecast past the period it predicts —
# and end() on a ts returns a c(year, period) pair, not a scalar, so
# "end(test_weekly) + 1/52" was not a valid starting point anyway.
forecast_time_index <- time(test_weekly)

# Plot historical data, actual test, and forecast, all on the original
# (undifferenced) AQI scale. The earlier version plotted the differenced
# training series, which is on a different scale from the other two.
plot(train_weekly, type = "l", col = "black",
     xlab = "Week", ylab = "PM 2.5 AQI",
     main = "PM 2.5 AQI Forecast - sARIMA")
lines(test_weekly, col = "blue")
lines(forecast_time_index, forecasted_values, col = "red")

# Adding legend
legend("topright", legend = c("Historical Data", "Actual Test", "Forecast"),
       col = c("black", "blue", "red"), lty = 1)

# Side-by-side table of actual vs forecasted weekly values
comparison_df <- data.frame(
  Date = time(test_weekly),
  Actual = as.vector(test_weekly),
  Forecasted = as.vector(forecasted_values)
)
print(comparison_df)
##         Date    Actual Forecasted
## 1   2021.001  91.57143   70.55457
## 2   2021.020  83.28571   68.49195
## 3   2021.039  77.28571   67.04783
## 4   2021.058  52.57143   66.02520
## 5   2021.077  46.42857   65.29516
## 6   2021.097  65.57143   64.77095
## 7   2021.116  64.57143   64.39297
## 8   2021.135  54.57143   64.11960
## 9   2021.154  58.42857   63.92145
## 10  2021.173  52.14286   63.77760
## 11  2021.192  43.85714   63.67305
## 12  2021.212  45.14286   63.59699
## 13  2021.231  50.85714   63.54164
## 14  2021.250  60.42857   63.50133
## 15  2021.269  63.14286   63.47196
## 16  2021.288  62.71429   63.45057
## 17  2021.307  55.00000   63.43498
## 18  2021.327  54.00000   63.42361
## 19  2021.346  68.14286   63.41533
## 20  2021.365  62.85714   63.40929
## 21  2021.384  56.14286   63.40489
## 22  2021.403  63.14286   63.40168
## 23  2021.422  60.71429   63.39934
## 24  2021.442  52.71429   63.39764
## 25  2021.461  67.28571   63.39640
## 26  2021.480  55.85714   63.39549
## 27  2021.499  73.85714   63.39483
## 28  2021.518  81.85714   63.39435
## 29  2021.537  61.71429   63.39400
## 30  2021.557  68.85714   63.39374
## 31  2021.576  59.14286   63.39355
## 32  2021.595  63.28571   63.39342
## 33  2021.614  58.14286   63.39332
## 34  2021.633  64.00000   63.39325
## 35  2021.652  71.42857   63.39319
## 36  2021.672  69.42857   63.39315
## 37  2021.691  63.42857   63.39313
## 38  2021.710  62.85714   63.39311
## 39  2021.729  70.14286   63.39309
## 40  2021.748  57.85714   63.39308
## 41  2021.767  52.57143   63.39307
## 42  2021.787  67.00000   63.39307
## 43  2021.806  50.14286   63.39306
## 44  2021.825  61.00000   63.39306
## 45  2021.844 107.14286   63.39306
## 46  2021.863  73.42857   63.39306
## 47  2021.882  98.85714   63.39305
## 48  2021.901  63.42857   63.39305
## 49  2021.921 125.71429   63.39305
## 50  2021.940  82.57143   63.39305
## 51  2021.959  65.00000   63.39305
## 52  2021.978  74.42857   63.39305
## 53  2021.997  58.14286   63.39305
## 54  2022.016  83.71429   63.39305
## 55  2022.036  64.42857   63.39305
## 56  2022.055  64.57143   63.39305
## 57  2022.074  63.71429   63.39305
## 58  2022.093  69.57143   63.39305
## 59  2022.112  64.71429   63.39305
## 60  2022.131  58.85714   63.39305
## 61  2022.151  49.14286   63.39305
## 62  2022.170  55.42857   63.39305
## 63  2022.189  56.14286   63.39305
## 64  2022.208  51.71429   63.39305
## 65  2022.227  57.42857   63.39305
## 66  2022.246  50.57143   63.39305
## 67  2022.266  63.42857   63.39305
## 68  2022.285  44.57143   63.39305
## 69  2022.304  43.85714   63.39305
## 70  2022.323  62.71429   63.39305
## 71  2022.342  61.14286   63.39305
## 72  2022.361  49.28571   63.39305
## 73  2022.381  56.42857   63.39305
## 74  2022.400  58.85714   63.39305
## 75  2022.419  60.14286   63.39305
## 76  2022.438  61.28571   63.39305
## 77  2022.457  56.28571   63.39305
## 78  2022.476  62.00000   63.39305
## 79  2022.496  59.14286   63.39305
## 80  2022.515  76.14286   63.39305
## 81  2022.534  56.14286   63.39305
## 82  2022.553  57.85714   63.39305
## 83  2022.572  53.00000   63.39305
## 84  2022.591  50.28571   63.39305
## 85  2022.611  56.85714   63.39305
## 86  2022.630  57.42857   63.39305
## 87  2022.649  56.85714   63.39305
## 88  2022.668  62.28571   63.39305
## 89  2022.687  53.00000   63.39305
## 90  2022.706  50.85714   63.39305
## 91  2022.726  40.71429   63.39305
## 92  2022.745  56.71429   63.39305
## 93  2022.764  65.14286   63.39305
## 94  2022.783  58.00000   63.39305
## 95  2022.802  54.42857   63.39305
## 96  2022.821  58.85714   63.39305
## 97  2022.841  52.85714   63.39305
## 98  2022.860  50.57143   63.39305
## 99  2022.879  64.42857   63.39305
## 100 2022.898  77.71429   63.39305
## 101 2022.917  60.28571   63.39305
## 102 2022.936  54.28571   63.39305
## 103 2022.956  61.28571   63.39305
## 104 2022.975  99.00000   63.39305
## 105 2022.994  51.42857   63.39305
## 106 2023.013  48.42857   63.39305
## 107 2023.032  51.00000   63.39305
## 108 2023.051  58.00000   63.39305
## 109 2023.071  61.85714   63.39305
## 110 2023.090  60.14286   63.39305
## 111 2023.109  54.71429   63.39305
## 112 2023.128  55.42857   63.39305
## 113 2023.147  47.71429   63.39305
## 114 2023.166  48.14286   63.39305
## 115 2023.186  39.42857   63.39305
## 116 2023.205  48.57143   63.39305
## 117 2023.224  42.42857   63.39305
## 118 2023.243  49.14286   63.39305
## 119 2023.262  49.14286   63.39305
## 120 2023.281  63.85714   63.39305
## 121 2023.300  61.85714   63.39305
## 122 2023.320  70.28571   63.39305
## 123 2023.339  31.71429   63.39305
## 124 2023.358  52.42857   63.39305
## 125 2023.377  63.71429   63.39305
## 126 2023.396  44.57143   63.39305
## 127 2023.415  46.00000   63.39305
## 128 2023.435  38.42857   63.39305
## 129 2023.454  46.85714   63.39305
## 130 2023.473  50.85714   63.39305
## 131 2023.492  62.14286   63.39305
## 132 2023.511  88.85714   63.39305
## 133 2023.530  58.71429   63.39305
## 134 2023.550  60.42857   63.39305
## 135 2023.569  65.14286   63.39305
## 136 2023.588  62.14286   63.39305
## 137 2023.607  54.60000   63.39305
# Model summary: coefficients, Box-Cox lambda, and training-set accuracy
summary(arima_model_weekly)
## Series: train_weekly 
## ARIMA(1,1,2) 
## Box Cox transformation: lambda= -0.5880568 
## 
## Coefficients:
##          ar1      ma1     ma2
##       0.7291  -1.3756  0.3819
## s.e.  0.0568   0.0791  0.0776
## 
## sigma^2 = 0.0003005:  log likelihood = 3021.24
## AIC=-6034.48   AICc=-6034.44   BIC=-6014.3
## 
## Training set error measures:
##                   ME     RMSE     MAE       MPE     MAPE      MASE       ACF1
## Training set 1.61547 15.94283 11.5259 -1.993959 16.06187 0.7131783 0.02621965
# Shapiro-Wilk normality test on the weekly model's residuals
shapiro_test_result <- shapiro.test(arima_model_weekly$residuals)
print(shapiro_test_result)
## 
##  Shapiro-Wilk normality test
## 
## data:  arima_model_weekly$residuals
## W = 0.99235, p-value = 1.168e-05
# Interpret the test at the 5% significance level
if (shapiro_test_result$p.value >= 0.05) {
  cat("The residuals appear to be normally distributed (p-value:", shapiro_test_result$p.value, ")\n")
} else {
  cat("The residuals are not normally distributed (p-value:", shapiro_test_result$p.value, ")\n")
}
## The residuals are not normally distributed (p-value: 1.167989e-05 )

Step 10b: ARIMA model using auto.arima() - Monthly

# Monthly series was already converted to a ts object upstream:
# pm2_5_monthly_ts <- ts(pm2_5_monthly["19990131/20230811"], start = c(1999, 1), frequency = 12)

# Estimate the optimal Box-Cox lambda for the monthly training series
y_values_monthly <- train_monthly
optimal_lambda <- BoxCox.lambda(y_values_monthly)

# Apply the Box-Cox transform by hand: natural log when lambda is zero,
# (y^lambda - 1) / lambda otherwise
if (optimal_lambda == 0) {
  transformed_data_monthly <- log(y_values_monthly)
} else {
  transformed_data_monthly <- (y_values_monthly^optimal_lambda - 1) / optimal_lambda
}

# Stationarity screen on the raw monthly series.
# KPSS: small p-value => reject stationarity (differencing needed).
# ADF:  small p-value => reject a unit root (already stationary).
# The original condition OR-ed in "ADF p < 0.05", which triggered
# differencing precisely when ADF indicated the series was stationary;
# the ADF comparison must be p > 0.05 (failed to reject a unit root).
kpss_test_result <- kpss.test(pm2_5_monthly_ts)
## Warning in kpss.test(pm2_5_monthly_ts): p-value smaller than printed p-value
adf_test_result <- adf.test(pm2_5_monthly_ts)
## Warning in adf.test(pm2_5_monthly_ts): p-value smaller than printed p-value
if (kpss_test_result$p.value < 0.05 || adf_test_result$p.value > 0.05) {
  pm2_5_monthly_diff <- diff(pm2_5_monthly_ts)
} else {
  pm2_5_monthly_diff <- pm2_5_monthly_ts
}

# Stationarity screen on the Box-Cox transformed monthly series.
# Difference when KPSS rejects stationarity (p < 0.05) OR ADF fails to
# reject a unit root (p > 0.05). The original code compared the ADF
# p-value with "< 0.05", i.e. it differenced exactly when ADF said the
# series was ALREADY stationary; with KPSS p > 0.1 and ADF p = 0.01
# after one difference (see prior run's warnings), that inverted test
# forced an unnecessary second difference. The status messages were
# also attached to the wrong branches ("First Order Differencing was
# performed" printed when no differencing happened at all).
kpss_test_result <- kpss.test(transformed_data_monthly)
adf_test_result <- adf.test(transformed_data_monthly)
if (kpss_test_result$p.value < 0.05 || adf_test_result$p.value > 0.05) {
  pm2_5_monthly_diff <- diff(transformed_data_monthly)
  print("First Order Differencing was performed")

  # Re-test after the first difference
  kpss_test_result_diff <- kpss.test(pm2_5_monthly_diff)
  adf_test_result_diff <- adf.test(pm2_5_monthly_diff)

  if (kpss_test_result_diff$p.value < 0.05 || adf_test_result_diff$p.value > 0.05) {
    # Still non-stationary: apply a second-order difference
    pm2_5_monthly_diff <- diff(pm2_5_monthly_diff)
    print("First and Second Order Differencing was performed")
  }
} else {
  # Already stationary: use the transformed series as-is
  pm2_5_monthly_diff <- transformed_data_monthly
  print("No differencing was needed")
}
# Fit a seasonal ARIMA to the monthly training data, letting auto.arima()
# apply the Box-Cox transform (lambda) and choose the differencing orders
# itself. The previous pipeline modeled the manually differenced,
# transformed series and then applied InvBoxCox to its forecasts — but
# the inverse transform is not valid on differences, which is why an
# arbitrary "* 100" rescale was needed to get the plot near the data.
arima_model_monthly <- auto.arima(train_monthly, seasonal = TRUE,
                                  lambda = optimal_lambda)

# Forecast over the test horizon; with lambda supplied at fit time,
# forecast() returns means already back-transformed to the AQI scale.
h <- length(test_monthly)
forecast_arima_monthly <- forecast(arima_model_monthly, h = h)
original_scale_forecasts_monthly <- forecast_arima_monthly$mean

# cat() prints directly; wrapping it in print() also echoed its NULL
# return value in earlier runs.
cat("Optimal Lambda Value:", optimal_lambda, "\n")

# Plot the training data with the forecasts overlaid
plot(train_monthly, main = "Original Data and Forecasts")
lines(original_scale_forecasts_monthly, col = "blue")  # Forecasts in blue

# Add legend
legend("topleft", legend = c("Original Data", "Forecasts"),
       col = c("black", "blue"), lwd = 1)

# Residual diagnostics (ACF plot + Ljung-Box test)
checkresiduals(arima_model_monthly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,0,2)(0,0,2)[12] with non-zero mean
## Q* = 35.946, df = 17, p-value = 0.004663
## 
## Model df: 7.   Total lags used: 24
#Ljung-Box test .05179

# Plot train, test, and forecast on the original AQI scale. The previous
# version plotted the differenced/transformed series as "Train", which is
# on a completely different scale from the test data and forecasts.
plot(train_monthly, xlim = c(1999, 2023),
     ylim = range(c(train_monthly, test_monthly)),
     main = "Train, Test, and Forecasted Data - Monthly",
     xlab = "Year", ylab = "PM2.5 AQI Value")
lines(test_monthly, col = "red")  # Test data in red
lines(original_scale_forecasts_monthly, col = "blue", type = "o")  # Forecast in blue
legend("topleft", legend = c("Train", "Test", "Forecast"),
       fill = c("black", "red", "blue"))

# Test-set MAE; cat() prints directly — wrapping it in print() also
# echoed cat()'s NULL return value in earlier runs.
mae <- mean(abs(test_monthly - original_scale_forecasts_monthly))
cat("Mean Absolute Error (MAE):", round(mae, 2), "\n")

# Side-by-side table of actual vs forecasted monthly values
comparison_df <- data.frame(
  Date = time(test_monthly),
  Actual = as.vector(test_monthly),
  Forecasted = as.vector(original_scale_forecasts_monthly)
)
print(comparison_df)
##        Date   Actual Forecasted
## 1  2021.000 69.25806   88.03185
## 2  2021.083 60.78571   99.66849
## 3  2021.167 49.09677  100.36669
## 4  2021.250 58.90000  110.35343
## 5  2021.333 62.45161   99.55594
## 6  2021.417 59.50000   97.38366
## 7  2021.500 69.90323  105.60004
## 8  2021.583 64.67742   95.63940
## 9  2021.667 64.93333  104.14763
## 10 2021.750 57.61290   94.36865
## 11 2021.833 86.50000   99.47679
## 12 2021.917 80.35484  101.88149
## 13 2022.000 69.90323   99.58545
## 14 2022.083 60.14286   98.46090
## 15 2022.167 53.48387   97.13609
## 16 2022.250 53.73333  106.51176
## 17 2022.333 56.87097  100.76623
## 18 2022.417 60.23333   96.62587
## 19 2022.500 60.32258  103.50100
## 20 2022.583 55.70968   99.18585
## 21 2022.667 52.30000  101.98734
## 22 2022.750 58.70968   94.54031
## 23 2022.833 61.63333  100.31511
## 24 2022.917 66.22581  103.37849
## 25 2023.000 53.80645   97.26432
## 26 2023.083 54.42857  101.04354
## 27 2023.167 44.80645  100.17232
## 28 2023.250 60.83333  100.08646
## 29 2023.333 46.96774  100.05663
## 30 2023.417 48.60000  100.02717
## 31 2023.500 68.41935  100.01440
## 32 2023.583 58.36364  100.00776